Load necessary libraries

library(tidyverse) 
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(sf)
## Linking to GEOS 3.11.2, GDAL 3.6.2, PROJ 9.2.0; sf_use_s2() is TRUE
library(ggplot2)
library(ggmap)
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
##      (status 2 uses the sf package in place of rgdal)
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggmap':
## 
##     wind
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Load the dataset

# Pull the internship-task CSV straight from its raw GitHub URL and preview
# the first six rows.
url <- "https://raw.githubusercontent.com/cema-uonbi/internship_task/main/data/cema_internship_task_2023.csv"
data <- read.csv(file = url)
head(data, n = 6L)
##   period                 county Total.Dewormed Acute.Malnutrition
## 1 Jan-23         Baringo County           3659                  8
## 2 Jan-23           Bomet County           1580                 NA
## 3 Jan-23         Bungoma County           6590                 24
## 4 Jan-23           Busia County           7564                 NA
## 5 Jan-23 Elgeyo Marakwet County           1407                 NA
## 6 Jan-23            Embu County           3241                 72
##   stunted.6.23.months stunted.0..6.months stunted.24.59.months diarrhoea.cases
## 1                 471                  34                  380            2620
## 2                   1                   3                   NA            1984
## 3                  98                 154                   23            4576
## 4                 396                 143                  111            2239
## 5                  92                  71                    5            2739
## 6                 326                  86                   24            1376
##   Underweight.0..6.months Underweight.6.23.months Underweight.24.59.Months
## 1                      85                     739                      731
## 2                      41                      86                       16
## 3                     231                     315                      120
## 4                     251                     608                      125
## 5                      57                     104                       21
## 6                     141                     544                      160

Explore the structure and summary statistics of the dataset

# Column names, types and leading values of the raw data
data %>% glimpse()
## Rows: 1,410
## Columns: 11
## $ period                   <chr> "Jan-23", "Jan-23", "Jan-23", "Jan-23", "Jan-…
## $ county                   <chr> "Baringo County", "Bomet County", "Bungoma Co…
## $ Total.Dewormed           <int> 3659, 1580, 6590, 7564, 1407, 3241, 6751, 469…
## $ Acute.Malnutrition       <int> 8, NA, 24, NA, NA, 72, 250, 9, 26, 104, 36, N…
## $ stunted.6.23.months      <int> 471, 1, 98, 396, 92, 326, 40, 209, 51, 319, 2…
## $ stunted.0..6.months      <int> 34, 3, 154, 143, 71, 86, 13, 87, 6, 102, 279,…
## $ stunted.24.59.months     <int> 380, NA, 23, 111, 5, 24, 99, 58, 50, 155, 292…
## $ diarrhoea.cases          <int> 2620, 1984, 4576, 2239, 2739, 1376, 2314, 278…
## $ Underweight.0..6.months  <int> 85, 41, 231, 251, 57, 141, 223, 140, 13, 139,…
## $ Underweight.6.23.months  <dbl> 739, 86, 315, 608, 104, 544, 1856, 298, 180, …
## $ Underweight.24.59.Months <dbl> 731, 16, 120, 125, 21, 160, 1833, 84, 271, 57…
data %>% summary()
##     period             county          Total.Dewormed   Acute.Malnutrition
##  Length:1410        Length:1410        Min.   :    97   Min.   :   1.0    
##  Class :character   Class :character   1st Qu.:  2454   1st Qu.:  15.0    
##  Mode  :character   Mode  :character   Median :  4564   Median :  39.0    
##                                        Mean   : 11458   Mean   : 125.4    
##                                        3rd Qu.:  8222   3rd Qu.: 143.5    
##                                        Max.   :392800   Max.   :4123.0    
##                                                         NA's   :355       
##  stunted.6.23.months stunted.0..6.months stunted.24.59.months diarrhoea.cases
##  Min.   :   1.0      Min.   :   1.0      Min.   :   1.0       Min.   :  198  
##  1st Qu.:  69.5      1st Qu.:  36.5      1st Qu.:  22.0       1st Qu.: 1464  
##  Median : 159.0      Median :  84.0      Median :  50.0       Median : 2158  
##  Mean   : 280.2      Mean   : 139.8      Mean   : 110.8       Mean   : 2813  
##  3rd Qu.: 328.5      3rd Qu.: 157.0      3rd Qu.: 114.2       3rd Qu.: 3335  
##  Max.   :4398.0      Max.   :7900.0      Max.   :3169.0       Max.   :15795  
##  NA's   :11          NA's   :19          NA's   :14                          
##  Underweight.0..6.months Underweight.6.23.months Underweight.24.59.Months
##  Min.   :   6.0          Min.   :  16.0          Min.   :   1.00         
##  1st Qu.:  87.0          1st Qu.: 249.0          1st Qu.:  51.25         
##  Median : 162.5          Median : 456.0          Median : 120.50         
##  Mean   : 223.5          Mean   : 652.3          Mean   : 305.74         
##  3rd Qu.: 272.8          3rd Qu.: 791.8          3rd Qu.: 311.00         
##  Max.   :1937.0          Max.   :5348.0          Max.   :4680.00         
## 

Handle missing values

# Mean-impute every numeric indicator: each NA is replaced by the mean of the
# non-missing entries of its own column. `period` and `county` are character
# columns, so `where(is.numeric)` leaves them untouched.
# NOTE(review): Acute.Malnutrition has 355 NAs out of 1,410 rows, so the
# imputed mean will feature heavily in any later analysis of that column.
data <- data %>%
  mutate(
    across(
      where(is.numeric),
      function(column) {
        ifelse(is.na(column), mean(column, na.rm = TRUE), column)
      }
    )
  )

# Re-check column types and leading values now that the NAs are filled
data %>% glimpse()
## Rows: 1,410
## Columns: 11
## $ period                   <chr> "Jan-23", "Jan-23", "Jan-23", "Jan-23", "Jan-…
## $ county                   <chr> "Baringo County", "Bomet County", "Bungoma Co…
## $ Total.Dewormed           <int> 3659, 1580, 6590, 7564, 1407, 3241, 6751, 469…
## $ Acute.Malnutrition       <dbl> 8.0, 125.4, 24.0, 125.4, 125.4, 72.0, 250.0, …
## $ stunted.6.23.months      <dbl> 471, 1, 98, 396, 92, 326, 40, 209, 51, 319, 2…
## $ stunted.0..6.months      <dbl> 34, 3, 154, 143, 71, 86, 13, 87, 6, 102, 279,…
## $ stunted.24.59.months     <dbl> 380.000, 110.765, 23.000, 111.000, 5.000, 24.…
## $ diarrhoea.cases          <int> 2620, 1984, 4576, 2239, 2739, 1376, 2314, 278…
## $ Underweight.0..6.months  <int> 85, 41, 231, 251, 57, 141, 223, 140, 13, 139,…
## $ Underweight.6.23.months  <dbl> 739, 86, 315, 608, 104, 544, 1856, 298, 180, …
## $ Underweight.24.59.Months <dbl> 731, 16, 120, 125, 21, 160, 1833, 84, 271, 57…

Transform to tidy format

# Reshape to long format: one row per (period, county, indicator). Every
# column other than the two identifiers is stacked into `variable` / `value`.
data_tidy <- pivot_longer(
  data,
  cols = !c(period, county),
  names_to = "variable",
  values_to = "value"
)

Visualize with histograms

Create interactive histograms using plotly

# One histogram panel per indicator; "free" scales give each panel its own
# x and y ranges.
p <- data_tidy %>%
  ggplot(mapping = aes(value)) +
  geom_histogram() +
  theme_minimal() +
  facet_wrap(vars(variable), scales = "free")
p %>% ggplotly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Visualize with boxplots

Create interactive boxplots using plotly

# Box plots of the indicator values for each reporting period, coloured by
# indicator; the coordinates are flipped so the periods run down the
# vertical axis.
p <- data_tidy %>%
  ggplot(mapping = aes(period, value, colour = variable)) +
  geom_boxplot() +
  coord_flip() +
  theme_minimal()
p %>% ggplotly()

Join with spatial data

# Read the county boundary shapefile (path is relative to the working directory)
kenya_sf <- st_read(dsn = "County.shp")
## Reading layer `County' from data source 
##   `C:\Users\STREET_CODER\Documents\CEMA-Intership\County.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 47 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 33.91028 ymin: -4.798828 xmax: 41.90613 ymax: 5.414124
## Geodetic CRS:  WGS 84
# Strip the trailing " County" suffix so the dataset's county names can line
# up with the shapefile's `Name` column.
data$county <- sub(" County$", "", data$county)

# left_join() keeps only the keys present in kenya_sf, so any dataset county
# whose name is absent from the shapefile would be dropped without notice.
# Surface such names explicitly before joining.
unmatched_counties <- setdiff(unique(data$county), kenya_sf$Name)
if (length(unmatched_counties) > 0) {
  warning(
    "Counties not found in the shapefile (their rows are dropped by the join): ",
    paste(unmatched_counties, collapse = ", "),
    call. = FALSE
  )
}

# Attach the indicator rows to each county geometry (one county, many periods).
data_sf <- left_join(kenya_sf, data, by = c("Name" = "county"))

Map acute malnutrition

Create an interactive choropleth map using plotly

State research question

“What factors are associated with high rates of acute malnutrition across counties?”

Statistical analysis (e.g. correlation, regression)

Calculate correlation between Acute Malnutrition and Dewormed

# Correlation (cor()'s default method) between the acute malnutrition and
# total dewormed columns of the joined data.
correlation <- with(data_sf, cor(Acute.Malnutrition, Total.Dewormed))
print("Correlation coefficient between Acute Malnutrition and Dewormed:")
## [1] "Correlation coefficient between Acute Malnutrition and Dewormed:"
print(correlation)
## [1] 0.07208518

Fit a linear regression model to explore the relationship between Acute Malnutrition and Dewormed

# Simple linear regression with Acute.Malnutrition as the response and
# Total.Dewormed as the single predictor, fitted on the joined data.
linear_model <- lm(Acute.Malnutrition ~ Total.Dewormed, data = data_sf)
# Print the coefficient table, residual summary and fit statistics.
summary(linear_model)
## 
## Call:
## lm(formula = Acute.Malnutrition ~ Total.Dewormed, data = data_sf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -213.3 -102.3  -36.7    6.6 3983.2 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.179e+02  6.720e+00  17.544  < 2e-16 ***
## Total.Dewormed 6.548e-04  2.415e-04   2.712  0.00677 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 230 on 1408 degrees of freedom
## Multiple R-squared:  0.005196,   Adjusted R-squared:  0.00449 
## F-statistic: 7.355 on 1 and 1408 DF,  p-value: 0.006771

Graphs showing the relationship between Acute Malnutrition and Dewormed

Create interactive scatter plot with a regression line using plotly

# Scatter of acute malnutrition against total dewormed, overlaid with the
# least-squares line (no confidence band).
p <- ggplot(
  data = data_sf,
  mapping = aes(Total.Dewormed, Acute.Malnutrition)
) +
  geom_point() +
  geom_smooth(se = FALSE, method = "lm") +
  xlab("Dewormed") +
  ylab("Acute Malnutrition") +
  theme_minimal()
p %>% ggplotly()
## `geom_smooth()` using formula = 'y ~ x'

Create interactive box plot using plotly

# A box plot needs a discrete grouping on x. Mapping the continuous
# Total.Dewormed directly collapses every observation into a single box and
# makes ggplot2 warn "Continuous x aesthetic". Bin Total.Dewormed into four
# groups with roughly equal numbers of observations (cut_number) and draw one
# box of Acute.Malnutrition per group.
p <- ggplot(
  data_sf,
  aes(x = cut_number(Total.Dewormed, n = 4), y = Acute.Malnutrition)
) +
  geom_boxplot() +
  labs(x = "Dewormed (quartile group)", y = "Acute Malnutrition") +
  theme_minimal()
ggplotly(p)